Author:Xiaofan Lu
Reviewer:Ying Ge、Junyi Shen
过配对差异表达,画亚型特异性上(下)调基因的对角热图。比如样本分了4-5个组,分析并画出每个组里面上调或下调的热图。Diagonal heat maps of subtype-specific up(down)regulated genes were drawn by pairing differential expression. For example, the sample is divided into 4-5 groups, and the heat map within each group is analyzed and drawn.
Figure 2: A) Unsupervised clustering of lncRNAs identified 4 clusters: cluster I (related to the basal-like breast cancer subtype), cluster II (related to the HER-2 enriched subtype), cluster III (related to luminal A subtype), and cluster IV (related to luminal A and B subtypes). Correlation with PAM50 classification, estrogen receptor (ER), progesterone receptor (PR) and HER2 status are depicted.
在大于等于3组亚型的基础上,得到配对差异表达基因后,计算每个亚型特异性上/下调的基因集,并绘制特异性基因表达热图。On the basis of more than or equal to 3 isotypes, the gene sets specifically up/down for each subtype were calculated, and the specific gene expression heat maps were plotted.
使用国内镜像安装包Use the domestic mirror installation package
options("repos"= c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
options(BioC_mirror="http://mirrors.ustc.edu.cn/bioc/")
加载包Load the package
library(ClassDiscovery) # 距离测量Distance measurement
## Loading required package: cluster
## Loading required package: oompaBase
library(edgeR) # 差异表达Differential expression
## Loading required package: limma
library(NMF) # 绘制热图Draw a heat map
## Loading required package: pkgmaker
## Loading required package: registry
## Loading required package: rngtools
## NMF - BioConductor layer [OK] | Shared memory capabilities [NO: bigmemory] | Cores 3/4
## To enable shared memory capabilities, try: install.extras('
## NMF
## ')
library(gplots) # 热图颜色Heatmap colors
##
## Attaching package: 'gplots'
## The following object is masked from 'package:oompaBase':
##
## redgreen
## The following object is masked from 'package:stats':
##
## lowess
library(RColorBrewer) # 热图颜色Heatmap colors
Sys.setenv(LANGUAGE = "en") #显示英文报错信息Displays the error message in English
options(stringsAsFactors = FALSE) #禁止chr转成factor Prohibiting CHR from converting to factor
自定义配对比较函数Custom pairing comparison functions
这里用edgeR,如果用DESeq2或limma,就把FigureYa118MulticlassDESeq2或FigureYa119MulticlassLimma里面相应的函数替换过来,可无缝对接。这里用edgeR,如果用DESeq2或limma,就把FigureYa118MulticlassDESeq2或FigureYa119MulticlassLimma里面相应的函数替换过来,可无缝对接。
# 创建需要配对比较的列表Create a list that needs to be paired and compared
createList <- function(group=NULL) {
tumorsam <- names(group)
sampleList = list()
treatsamList =list()
treatnameList <- c()
ctrlnameList <- c()
#A-1: 类1 vs 其他Class 1 vs Other
sampleList[[1]] = tumorsam
treatsamList[[1]] = intersect(tumorsam, names(group[group=="Cluster1"])) # 亚型名称需要根据情况修改
treatnameList[1] <- "Cluster1" # 该亚型的命名
ctrlnameList[1] <- "Others" # 其他亚型的命名
#A-2: 类2 vs 其他Class 2 vs Other
sampleList[[2]] = tumorsam
treatsamList[[2]] = intersect(tumorsam, names(group[group=="Cluster2"]))
treatnameList[2] <- "Cluster2"
ctrlnameList[2] <- "Others"
#A-3: 类3 vs 其他Class 3 vs Other
sampleList[[3]] = tumorsam
treatsamList[[3]] = intersect(tumorsam, names(group[group=="Cluster3"]))
treatnameList[3] <- "Cluster3"
ctrlnameList[3] <- "Others"
#A-4: 类4 vs 其他Class 4 vs Other
sampleList[[4]] = tumorsam
treatsamList[[4]] = intersect(tumorsam, names(group[group=="Cluster4"]))
treatnameList[4] <- "Cluster4"
ctrlnameList[4] <- "Others"
return(list(sampleList, treatsamList, treatnameList, ctrlnameList))
}
# 配对pairing edgeR
twoclassedgeR <- function(res.path=NULL, countsTable=NULL, prefix=NULL, complist=NULL, overwt=FALSE) {
#Groupinfo could contain "batch", which will be considered by edgeR design matrix
sampleList <- complist[[1]]
treatsamList <- complist[[2]]
treatnameList <- complist[[3]]
ctrlnameList <- complist[[4]]
allsamples <- colnames(countsTable)
options(warn=1)
for (k in 1:length(sampleList)) { # 循环读取每一次比较的内容Loop through the contents of each comparison
samples <- sampleList[[k]]
treatsam <- treatsamList[[k]]
treatname <- treatnameList[k]
ctrlname <- ctrlnameList[k]
compname <- paste(treatname, "_vs_", ctrlname, sep="") # 生成最终文件名Generate the final file name
tmp = rep("others", times=length(allsamples))
names(tmp) <- allsamples
tmp[samples]="control"
tmp[treatsam]="treatment"
outfile <- file.path( res.path, paste(prefix, "_edgeR_test_result.", compname, ".txt", sep="") )
if (file.exists(outfile) & (overwt==FALSE)) { # 因此差异表达分析较慢,因此如果文件存在,在不覆盖的情况下(overwt=F)不再次计算差异表达Therefore, differential expression analysis is slow, so if the file exists, it is not calculated again without overwriting (overwt=F).
cat(k, ":", compname, "exists and skipped;\n")
next
}
saminfo <- data.frame("Type"=tmp[samples],"SampleID"=samples,stringsAsFactors = F)
group=factor(saminfo$Type,levels = c("control","treatment"))
design <- model.matrix(~group) # 设计矩阵仅包含亚型信息,若有批次效应请修改,例如The design matrix contains only subtype information, if there is a batch effect, please modify it, for example design <- model.matrix(~group+treat)
rownames(design) <- samples
# 差异表达过程,具体参数细节及输出结果解释,请参阅相关For the process of expressing the difference, details of the specific parameters, and explanation of the output results, please refer to the related section document
y <- DGEList(counts=countsTable[,samples],group=saminfo$Type)
y <- calcNormFactors(y)
y <- estimateDisp(y, design, robust=TRUE)
fit <- glmFit(y, design)
lrt <- glmLRT(fit)
ordered_tags <- topTags(lrt, n=100000)
allDiff=ordered_tags$table
allDiff=allDiff[is.na(allDiff$FDR)==FALSE,]
diff=allDiff
diff$id <- rownames(diff)
res <- diff[,c("id","logFC","logCPM","LR","PValue","FDR")]
colnames(res) <- c("id","log2FC","logCPM","LR","PValue","FDR")
write.table(res, file=outfile, row.names=F, col.names=T, sep="\t", quote=F)
cat(k, ",")
}
options(warn=0)
}
自定义画图函数Custom drawing functions
# 特异性基因计算和热图绘制Specific gene calculations and heat mapping
sigheat <- function(featdata=NULL, # 表达谱,注意一般是标准化后的FPKM或者TPM,此时norm选择none Expression spectrum, note that it is generally standardized FPKM or TPM, in this case norm chooses none
DEfiles=NULL, # 差异表达结果文件名Differential expression result file name
outfile=NULL, # 输出文件名Output file name
hcs=NULL, # 聚类树结构Cluster tree structure
heatCol=greenred(128), #热图配色Heatmap color matching
# 如果你喜欢橙黄蓝,可以换成下面这行If you like orange, yellow and blue, you can change to the following line
#heatCol=colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100),
annCol=NULL, # 样本注释信息Sample annotation information
annColors=NULL, # 注释信息的颜色The color of the annotation information
res.path=NULL, # 存储特异性表达的基因集的路径Pathways to store specifically expressed gene sets
fig.path=NULL, # 存储热图的路径The path to store the heatmap
halfwidth=3, # 表达谱标准化后的上下限cutoff The upper and lower limits of the expression spectrum are normalized and cutoff
padjcut=0.05, # fdr的阈值Threshold for FDR
log2fccut=1, # log2fc的阈值,默认为1 log2fc, the default is 1
height=8, # 热图图片高度Heatmap image height
width=8, # 热图的宽度The width of the heat map
norm="none", # 输入数据是否需要标准化,默认不需要,可选择quantile,或者median标准化Whether the input data needs to be normalized, it is not required by default, and you can choose quantile or median normalization
dirct="UP", # 特异性基因的表达方向,默认为上调(推荐),可选择UP或DOWN The expression direction of specific genes is up-regulated by default (recommended), and UP or DOWN can be selected
fontsize=8, # 热图的字号The font size of the heatmap
labRow = F, # 热图是否显示基因名Whether the heatmap shows the gene name
labCol = F) { # 热图是否显示样本名Whether the heatmap shows the sample name
samabsFlag <- FALSE
if (is.null(hcs)) {samabsFlag <- TRUE}
if(is.null(DEfiles)) {stop("DEfiles is NULL!")}
if (!is.element(norm, c("none", "median", "quantile"))) {stop( "norm type error!") }
if (!is.element(dirct, c("UP", "DOWN"))) {stop( "dirct type error!") }
if(dirct=="UP") { outlabel <- "uniquely_significantly_overexpressed.txt" }
if(dirct=="DOWN") { outlabel <- "uniquely_significantly_underexpressed.txt" }
genelist <- c()
for (filek in DEfiles) {
DEres <- read.table(file.path(res.path, filek), header=T, row.names=NULL, sep="\t", quote="", stringsAsFactors=F)
DEres <- DEres[!duplicated(DEres[, 1]),]
DEres <- DEres[!is.na(DEres[, 1]), ]
rownames(DEres) <- DEres[, 1]
DEres <- DEres[, -1]
if (dirct=="UP") {
genelist <- c( genelist, rownames(DEres[!is.na(DEres$FDR) & DEres$FDR < padjcut & !is.na(DEres$log2FC) & DEres$log2FC > log2fccut, ]) )
}
if (dirct=="DOWN") {
genelist <- c( genelist, rownames(DEres[!is.na(DEres$FDR) & DEres$FDR < padjcut & !is.na(DEres$log2FC) & DEres$log2FC < -log2fccut, ]) )
}
}
unqlist <- setdiff(genelist,genelist[duplicated(genelist)])
for (filek in DEfiles) {
DEres <- read.table(file.path(res.path, filek), header=T, row.names=NULL, sep="\t", quote="", stringsAsFactors=F)
DEres <- DEres[!duplicated(DEres[, 1]),]
DEres <- DEres[!is.na(DEres[, 1]), ]
rownames(DEres) <- DEres[, 1]
DEres <- DEres[, -1]
if(dirct=="UP") {
outk <- intersect( unqlist, rownames(DEres[!is.na(DEres$FDR) & DEres$FDR < padjcut & !is.na(DEres$log2FC) & DEres$log2FC > log2fccut, ]) )
}
if(dirct=="DOWN") {
outk <- intersect( unqlist, rownames(DEres[!is.na(DEres$FDR) & DEres$FDR < padjcut & !is.na(DEres$log2FC) & DEres$log2FC < -log2fccut, ]) )
}
write.table(outk, file=file.path(res.path, paste(filek, outlabel, sep="_")), row.names=F, col.names=F, sep="\t", quote=F)
}
if (sum(!is.element(unqlist, rownames(featdata)))>0) {stop("unqlist not found in featdata!")}
samples <- colnames(featdata)
if (norm=="quantile") {
tmp <- normalize.quantiles(as.matrix(featdata))
rownames(tmp) <- rownames(featdata)
colnames(tmp) <- colnames(featdata)
}
if (norm=="median") {
tmp <- sweep(featdata,2, apply(featdata,2,median,na.rm=T))
}
if(norm=="none") {
tmp <- featdata
}
if (samabsFlag) {
subdata <- tmp[unqlist, intersect(samples, colnames(tmp))]
}else{
subdata <- tmp[unqlist, samples]
}
hcg <- hclust(distanceMatrix(as.matrix(t(subdata)), "pearson"), "ward.D")
plotdata <- t(scale(t(subdata)))
plotdata[plotdata > halfwidth] <- halfwidth
plotdata[plotdata < - halfwidth] <- -halfwidth
pdf(file=file.path(fig.path, outfile), height=height, width=width)
Rowv <- NA
if (samabsFlag) {
if (ncol(annCol)<2) {
tmp <- as.data.frame(annCol[colnames(plotdata), ])
rownames(tmp) <- colnames(plotdata)
colnames(tmp) <- colnames(annCol)
}else{
tmp <- annCol[colnames(plotdata), ]
}
if(isTRUE(labRow) & isTRUE(labCol)){
aheatmap(plotdata, Rowv=Rowv, Colv=NA, annCol=tmp, annColors=annColors, color=heatCol, revC=TRUE, breaks=0, fontsize=fontsize)
}
if(isTRUE(labRow) & isFALSE(labCol)){
aheatmap(plotdata, Rowv=Rowv, Colv=NA, annCol=tmp, annColors=annColors, color=heatCol, revC=TRUE, breaks=0, fontsize=fontsize, labCol = NA)
}
if(isFALSE(labRow) & isTRUE(labCol)){
aheatmap(plotdata, Rowv=Rowv, Colv=NA, annCol=tmp, annColors=annColors, color=heatCol, revC=TRUE, breaks=0, fontsize=fontsize, labRow = NA)
}
if(isFALSE(labRow) & isFALSE(labCol)){
aheatmap(plotdata, Rowv=Rowv, Colv=NA, annCol=tmp, annColors=annColors, color=heatCol, revC=TRUE, breaks=0, fontsize=fontsize, labCol = NA, labRow = NA)
}
}else{
if (ncol(annCol)<2) {
tmp <- as.data.frame(annCol[samples, ])
rownames(tmp) <- samples
colnames(tmp) <- colnames(annCol)
}else{
tmp <- annCol[samples, ]
}
if(sum(hcs$labels!=colnames(plotdata))>0) {stop("colnames mismatch for aheatmap!")}
if(isTRUE(labRow) & isTRUE(labCol)){
aheatmap(plotdata, Rowv=Rowv, Colv=as.dendrogram(hcs), annCol=tmp, annColors=annColors, color=heatCol, revC=TRUE, breaks=0, fontsize=fontsize)
}
if(isTRUE(labRow) & isFALSE(labCol)){
aheatmap(plotdata, Rowv=Rowv, Colv=as.dendrogram(hcs), annCol=tmp, annColors=annColors, color=heatCol, revC=TRUE, breaks=0, fontsize=fontsize, labCol = NA)
}
if(isFALSE(labRow) & isTRUE(labCol)){
aheatmap(plotdata, Rowv=Rowv, Colv=as.dendrogram(hcs), annCol=tmp, annColors=annColors, color=heatCol, revC=TRUE, breaks=0, fontsize=fontsize, labRow = NA)
}
if(isFALSE(labRow) & isFALSE(labCol)){
aheatmap(plotdata, Rowv=Rowv, Colv=as.dendrogram(hcs), annCol=tmp, annColors=annColors, color=heatCol, revC=TRUE, breaks=0, fontsize=fontsize, labCol = NA, labRow = NA)
}
}
invisible(dev.off())
# return(unqlist)
}
easy_input_counts.txt,easy_input_FPKM.txt,表达矩阵,分别是read count和FPKM。这里以read count为输入,用edgeR做配对比较,用FPKM画图。The expression matrix is read count and FPKM. Here, we use read count as input, use edgeR for pairing comparison, and use FPKM to draw a picture
# 设置文件路径Set the file path
workdir <- "."
# 读取SKCM表达谱(count和对应的FPKM,或者TPM文件)Read the SKCM expression profile (count and corresponding FPKM, or TPM file)
countstable <- read.table("easy_input_counts.txt",sep = "\t",row.names = 1,header = T,check.names = F,stringsAsFactors = F)
# FPKM与counts对应。如果你要把count转换为FPKM,可参考FPKM corresponds to counts. If you want to convert count to FPKM, you can refer to it FigureYa34count2FPKM
FPKM <- read.table("easy_input_FPKM.txt",sep = "\t",row.names = 1,header = T,check.names = F,stringsAsFactors = F) #
# 如果你只有FPKM数据,打算用FigureYa119Multiclasslimma做配对比较,则只需提供If you only have FPKM data and want to use FigureYa119Multiclasslimma for pairing comparison, you only need to provide it easy_input_FPKM.txt
# 运行下面这行,不运行本段第一行Run the following line, not the first line of this paragraph
#countstable <- FPKM
all(colnames(countstable) == colnames(FPKM)) # 检查一下样本名Check the sample name
## [1] TRUE
all(rownames(countstable) == rownames(FPKM)) # 检查一下基因名Check the gene name
## [1] TRUE
# 样本聚类Sample clustering #
input <- log2(FPKM + 1) # 数据对数化Data logarithm
hcs <- hclust(distanceMatrix(as.matrix(input), "pearson"), "ward.D") # 样本聚类,更多聚类请参阅Sample clustering, see More Clustering FigureYa91cluster_heatmap
group <- cutree(hcs, k = 4) # 样本分4类(实际上大于等于3类就可以)Samples are divided into 4 categories (in fact, it is sufficient to be greater than or equal to 3 categories)
group <- paste0("Cluster",as.numeric(group))
names(group) <- colnames(countstable)
#--------------#
# 配对差异表达 Paired differential expression#
# 注意:该处使用的是FigureYa120MulticlassedgeR,若需使用其他差异表达算法请参阅Note: FigureYa120MulticlassedgeR is used here, if you need to use other difference expression algorithms, please refer to itFigureYa118MulticlassDESeq2,FigureYa119Multiclasslimma
complist <- createList(group=group)
twoclassedgeR(res.path = ".", #所有配对差异表达结果都会输出在res.path路径下All paired differential expression results are output under the res.path path
countsTable = countstable,
prefix = "SKCM", #文件名以SKCM开头The file name starts with SKCM
complist = complist,
overwt = F)
## 1 : Cluster1_vs_Others exists and skipped;
## 2 : Cluster2_vs_Others exists and skipped;
## 3 : Cluster3_vs_Others exists and skipped;
## 4 : Cluster4_vs_Others exists and skipped;
#--------------------------------------#
# 创建样本注释和注释颜色以显示在热图上Create sample annotations and annotation colors to display on the heatmap #
annCol <- data.frame(Group = group,
row.names = names(group),
stringsAsFactors = F)
annColors <- list(Group = c("Cluster1"="yellow",
"Cluster2"="blue",
"Cluster3"="green",
"Cluster4"="red")) #如果你有更多类,就依此规律继续添加If you have more classes, continue to add them according to this pattern
#----------------------------#
# 计算特异性上调基因并画热图Calculate specific upregulation genes and draw heat maps #
sigheat(featdata = log2(FPKM + 1), # 数据已标准化Data is standardized
# 注意:DEfiles的顺序是要进行调整的,当第一次输出热图后观察对应亚型色块所在的位置调整文件读入顺序Note: The order of DEfiles needs to be adjusted, and when the heat map is output for the first time, observe the position of the corresponding subtype color block to adjust the file reading order
DEfiles = c("SKCM_edgeR_test_result.Cluster4_vs_Others.txt",
"SKCM_edgeR_test_result.Cluster1_vs_Others.txt",
"SKCM_edgeR_test_result.Cluster2_vs_Others.txt",
"SKCM_edgeR_test_result.Cluster3_vs_Others.txt"),
hcs = hcs, # 样本聚类信息Sample clustering information
annCol = annCol[colnames(FPKM),,drop = F], # 样本注释Sample notes
annColors = annColors, # 样本注释颜色Sample notes color
res.path = workdir,
fig.path = workdir,
outfile = "skcm_sigheatmap_upexpressed.pdf",
halfwidth = 2,
padjcut = 0.05,
log2fccut = 1,
norm = "none", # 不标准化Not standardized
dirct = "UP", # 上调Raised
width = 8,
height = 8)
#----------------------------#
# 计算特异性下调基因并画热图Calculate specific downregulation genes and draw heat maps #
sigheat(featdata = log2(FPKM + 1), # 数据已标准化Data is standardized
# 注意:DEfiles的顺序是要进行调整的,当第一次输出热图后观察对应亚型色块所在的位置调整文件读入顺序Note: The order of DEfiles needs to be adjusted, and when the heat map is output for the first time, observe the position of the corresponding subtype color block to adjust the file reading order
DEfiles = c("SKCM_edgeR_test_result.Cluster4_vs_Others.txt",
"SKCM_edgeR_test_result.Cluster1_vs_Others.txt",
"SKCM_edgeR_test_result.Cluster2_vs_Others.txt",
"SKCM_edgeR_test_result.Cluster3_vs_Others.txt"),
hcs = hcs, # 样本聚类信息Sample clustering information
annCol = annCol[colnames(FPKM),,drop = F], # 样本注释Sample notes
annColors = annColors, # 样本注释颜色Sample annotation color
res.path = workdir,
fig.path = workdir,
outfile = "skcm_sigheatmap_downexpressed.pdf",
halfwidth = 2,
padjcut = 0.05,
log2fccut = 1,
norm = "none", # 不标准化Not standardized
dirct = "DOWN", # 下调(不推荐)Downgrade (not recommended)
width = 8,
height = 8)
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## Random number generation:
## RNG: Mersenne-Twister
## Normal: Inversion
## Sample: Rounding
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] RColorBrewer_1.1-2 gplots_3.1.0 NMF_0.23.0
## [4] Biobase_2.48.0 BiocGenerics_0.34.0 rngtools_1.5
## [7] pkgmaker_0.31.1 registry_0.5-1 edgeR_3.30.3
## [10] limma_3.44.3 ClassDiscovery_3.3.12 oompaBase_3.2.9
## [13] cluster_2.1.0
##
## loaded via a namespace (and not attached):
## [1] gtools_3.8.2 locfit_1.5-9.4 tidyselect_1.1.0 xfun_0.17
## [5] reshape2_1.4.4 purrr_0.3.4 lattice_0.20-41 colorspace_1.4-1
## [9] vctrs_0.3.4 generics_0.0.2 oompaData_3.1.1 htmltools_0.5.0
## [13] yaml_2.2.1 rlang_0.4.7 pillar_1.4.6 glue_1.4.2
## [17] withr_2.2.0 plyr_1.8.6 foreach_1.5.0 lifecycle_0.2.0
## [21] stringr_1.4.0 munsell_0.5.0 gtable_0.3.0 caTools_1.18.0
## [25] codetools_0.2-16 evaluate_0.14 knitr_1.29 doParallel_1.0.15
## [29] Rcpp_1.0.5 KernSmooth_2.23-17 xtable_1.8-4 scales_1.1.1
## [33] ggplot2_3.3.2 digest_0.6.25 stringi_1.5.3 dplyr_1.0.2
## [37] grid_4.0.2 bibtex_0.4.2.3 bitops_1.0-6 tools_4.0.2
## [41] magrittr_1.5 tibble_3.0.3 crayon_1.3.4 pkgconfig_2.0.3
## [45] ellipsis_0.3.1 gridBase_0.4-7 assertthat_0.2.1 rmarkdown_2.3
## [49] iterators_1.0.12 R6_2.4.1 mclust_5.4.6 compiler_4.0.2